There is a wealth of data on the internet. How can we scrape and analyze it?
rvest is an R package written by Hadley Wickham which makes web scraping easy.
We follow the instructions in a blog post by Saurav Kaushik to find the most popular feature films of 2018.
Install the SelectorGadget extension for Chrome.
The 100 most popular feature films released in 2018 can be accessed at page https://www.imdb.com/search/title?count=100&release_date=2018,2018&title_type=feature.
#Loading the rvest and tidyverse package
library("rvest")
## Loading required package: xml2
library("tidyverse")
## ── Attaching packages ────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.1.0 ✔ purrr 0.3.0
## ✔ tibble 2.0.1 ✔ dplyr 0.7.8
## ✔ tidyr 0.8.2 ✔ stringr 1.4.0
## ✔ readr 1.3.1 ✔ forcats 0.3.0
## ── Conflicts ───────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ readr::guess_encoding() masks rvest::guess_encoding()
## ✖ dplyr::lag() masks stats::lag()
## ✖ purrr::pluck() masks rvest::pluck()
# Specify the URL of the page to be scraped. HTTPS is used so the request is
# encrypted and matches the address quoted in the text above.
url <- 'https://www.imdb.com/search/title?count=100&release_date=2018,2018&title_type=feature'
# Download the page and parse its HTML. Wrapping the assignment in
# parentheses also prints the parsed document.
(webpage <- read_html(url))
## {xml_document}
## <html xmlns:og="http://ogp.me/ns#" xmlns:fb="http://www.facebook.com/2008/fbml">
## [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset= ...
## [2] <body id="styleguide-v2" class="fixed">\n\n <img height=" ...
Use the CSS selector to get the rankings
# Select every node matching the ranking CSS selector, then print the node set
rank_data_html <- webpage %>% html_nodes('.text-primary')
rank_data_html
## {xml_nodeset (100)}
## [1] <span class="lister-item-index unbold text-primary">1.</span>
## [2] <span class="lister-item-index unbold text-primary">2.</span>
## [3] <span class="lister-item-index unbold text-primary">3.</span>
## [4] <span class="lister-item-index unbold text-primary">4.</span>
## [5] <span class="lister-item-index unbold text-primary">5.</span>
## [6] <span class="lister-item-index unbold text-primary">6.</span>
## [7] <span class="lister-item-index unbold text-primary">7.</span>
## [8] <span class="lister-item-index unbold text-primary">8.</span>
## [9] <span class="lister-item-index unbold text-primary">9.</span>
## [10] <span class="lister-item-index unbold text-primary">10.</span>
## [11] <span class="lister-item-index unbold text-primary">11.</span>
## [12] <span class="lister-item-index unbold text-primary">12.</span>
## [13] <span class="lister-item-index unbold text-primary">13.</span>
## [14] <span class="lister-item-index unbold text-primary">14.</span>
## [15] <span class="lister-item-index unbold text-primary">15.</span>
## [16] <span class="lister-item-index unbold text-primary">16.</span>
## [17] <span class="lister-item-index unbold text-primary">17.</span>
## [18] <span class="lister-item-index unbold text-primary">18.</span>
## [19] <span class="lister-item-index unbold text-primary">19.</span>
## [20] <span class="lister-item-index unbold text-primary">20.</span>
## ...
# Extract the text content of each ranking node
rank_data <- rank_data_html %>% html_text()
rank_data
## [1] "1." "2." "3." "4." "5." "6." "7." "8." "9." "10."
## [11] "11." "12." "13." "14." "15." "16." "17." "18." "19." "20."
## [21] "21." "22." "23." "24." "25." "26." "27." "28." "29." "30."
## [31] "31." "32." "33." "34." "35." "36." "37." "38." "39." "40."
## [41] "41." "42." "43." "44." "45." "46." "47." "48." "49." "50."
## [51] "51." "52." "53." "54." "55." "56." "57." "58." "59." "60."
## [61] "61." "62." "63." "64." "65." "66." "67." "68." "69." "70."
## [71] "71." "72." "73." "74." "75." "76." "77." "78." "79." "80."
## [81] "81." "82." "83." "84." "85." "86." "87." "88." "89." "90."
## [91] "91." "92." "93." "94." "95." "96." "97." "98." "99." "100."
# Coerce the rank strings (e.g. "1.") to integers
rank_data <- as.integer(rank_data)
rank_data
## [1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
## [18] 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34
## [35] 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51
## [52] 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68
## [69] 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85
## [86] 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100
Use SelectorGadget to find the CSS selector .lister-item-header a.
# Select the title links with their CSS selector, then print the node set
title_data_html <- webpage %>% html_nodes('.lister-item-header a')
title_data_html
## {xml_nodeset (100)}
## [1] <a href="/title/tt1727824/?ref_=adv_li_tt">Bohemian Rhapsody</a>
## [2] <a href="/title/tt4530422/?ref_=adv_li_tt">Overlord</a>
## [3] <a href="/title/tt6966692/?ref_=adv_li_tt">Green Book</a>
## [4] <a href="/title/tt1477834/?ref_=adv_li_tt">Aquaman</a>
## [5] <a href="/title/tt5083738/?ref_=adv_li_tt">The Favourite</a>
## [6] <a href="/title/tt1517451/?ref_=adv_li_tt">A Star Is Born</a>
## [7] <a href="/title/tt3606756/?ref_=adv_li_tt">Incredibles 2</a>
## [8] <a href="/title/tt1571234/?ref_=adv_li_tt">Mortal Engines</a>
## [9] <a href="/title/tt4218572/?ref_=adv_li_tt">Widows</a>
## [10] <a href="/title/tt4532826/?ref_=adv_li_tt">Robin Hood</a>
## [11] <a href="/title/tt6155172/?ref_=adv_li_tt">Roma</a>
## [12] <a href="/title/tt6266538/?ref_=adv_li_tt">Vice</a>
## [13] <a href="/title/tt4633694/?ref_=adv_li_tt">Spider-Man: Into the Spi ...
## [14] <a href="/title/tt4154756/?ref_=adv_li_tt">Avengers: Infinity War</a>
## [15] <a href="/title/tt7349662/?ref_=adv_li_tt">BlacKkKlansman</a>
## [16] <a href="/title/tt2737304/?ref_=adv_li_tt">Bird Box</a>
## [17] <a href="/title/tt4595882/?ref_=adv_li_tt">Can You Ever Forgive Me? ...
## [18] <a href="/title/tt1034415/?ref_=adv_li_tt">Suspiria</a>
## [19] <a href="/title/tt5095030/?ref_=adv_li_tt">Ant-Man and the Wasp</a>
## [20] <a href="/title/tt8359848/?ref_=adv_li_tt">Climax</a>
## ...
# Extract the title text from each link node
title_data <- title_data_html %>% html_text()
title_data
## [1] "Bohemian Rhapsody"
## [2] "Overlord"
## [3] "Green Book"
## [4] "Aquaman"
## [5] "The Favourite"
## [6] "A Star Is Born"
## [7] "Incredibles 2"
## [8] "Mortal Engines"
## [9] "Widows"
## [10] "Robin Hood"
## [11] "Roma"
## [12] "Vice"
## [13] "Spider-Man: Into the Spider-Verse"
## [14] "Avengers: Infinity War"
## [15] "BlacKkKlansman"
## [16] "Bird Box"
## [17] "Can You Ever Forgive Me?"
## [18] "Suspiria"
## [19] "Ant-Man and the Wasp"
## [20] "Climax"
## [21] "The Mule"
## [22] "The Man Who Killed Hitler and Then The Bigfoot"
## [23] "First Man"
## [24] "Black Panther"
## [25] "Hunter Killer"
## [26] "The Girl in the Spider's Web"
## [27] "Venom"
## [28] "Bumblebee"
## [29] "Bad Times at the El Royale"
## [30] "The Ballad of Buster Scruggs"
## [31] "Mary Queen of Scots"
## [32] "Uncle Drew"
## [33] "Solo: A Star Wars Story"
## [34] "Dragon Ball Super: Broly"
## [35] "A Quiet Place"
## [36] "Fantastic Beasts: The Crimes of Grindelwald"
## [37] "Ready Player One"
## [38] "Arctic"
## [39] "A Simple Favor"
## [40] "A Private War"
## [41] "Deadpool 2"
## [42] "The Front Runner"
## [43] "The Grinch"
## [44] "Boy Erased"
## [45] "At Eternity's Gate"
## [46] "Todos lo saben"
## [47] "Tag"
## [48] "Prospect"
## [49] "Mary Poppins Returns"
## [50] "Beautiful Boy"
## [51] "Annihilation"
## [52] "Crazy Rich Asians"
## [53] "Cold War"
## [54] "Mission: Impossible - Fallout"
## [55] "If Beale Street Could Talk"
## [56] "Hereditary"
## [57] "The Nutcracker and the Four Realms"
## [58] "Instant Family"
## [59] "Halloween"
## [60] "Burning"
## [61] "Ralph Breaks the Internet"
## [62] "Ocean's 8"
## [63] "The Sisters Brothers"
## [64] "Creed II"
## [65] "The Predator"
## [66] "Hotel Transylvania 3: Summer Vacation"
## [67] "Replicas"
## [68] "Stan & Ollie"
## [69] "The Hate U Give"
## [70] "On the Basis of Sex"
## [71] "Destroyer"
## [72] "Upgrade"
## [73] "Red Sparrow"
## [74] "The Guilty"
## [75] "The House That Jack Built"
## [76] "Isle of Dogs"
## [77] "Searching"
## [78] "The Old Man & the Gun"
## [79] "Nobody's Fool"
## [80] "Game Night"
## [81] "Capharnaüm"
## [82] "Sicario: Day of the Soldado"
## [83] "Dumplin'"
## [84] "The Meg"
## [85] "Black Mirror: Bandersnatch"
## [86] "Rampage"
## [87] "The Nun"
## [88] "Untogether"
## [89] "To All the Boys I've Loved Before"
## [90] "Then Came You"
## [91] "Jurassic World: Fallen Kingdom"
## [92] "Little Italy"
## [93] "K.G.F: Chapter 1"
## [94] "Eighth Grade"
## [95] "Love, Simon"
## [96] "Peranbu"
## [97] "High Life"
## [98] "Mile 22"
## [99] "Mowgli: Legend of the Jungle"
## [100] "Mandy"
# Using CSS selectors to scrape the description section
# Select the muted-text element that directly follows each ratings bar
description_data_html <- webpage %>% html_nodes('.ratings-bar+ .text-muted')
description_data_html
## {xml_nodeset (100)}
## [1] <p class="text-muted">\n The story of the legendary rock band <a ...
## [2] <p class="text-muted">\n A small group of American soldiers find ...
## [3] <p class="text-muted">\n A working-class Italian-American bounce ...
## [4] <p class="text-muted">\n Arthur Curry, the human-born heir to th ...
## [5] <p class="text-muted">\n In early 18th century England, a frail ...
## [6] <p class="text-muted">\n A musician helps a young singer find fa ...
## [7] <p class="text-muted">\n The Incredibles hero family takes on a ...
## [8] <p class="text-muted">\n In a post-apocalyptic world where citie ...
## [9] <p class="text-muted">\n Set in contemporary Chicago, amid a tim ...
## [10] <p class="text-muted">\n A war-hardened Crusader and his Moorish ...
## [11] <p class="text-muted">\n A year in the life of a middle-class fa ...
## [12] <p class="text-muted">\n The story of <a href="/name/nm0155515"> ...
## [13] <p class="text-muted">\n Teen Miles Morales becomes Spider-Man o ...
## [14] <p class="text-muted">\n The Avengers and their allies must be w ...
## [15] <p class="text-muted">\n Ron Stallworth, an African American pol ...
## [16] <p class="text-muted">\n Five years after an ominous unseen pres ...
## [17] <p class="text-muted">\n When Lee Israel falls out of step with ...
## [18] <p class="text-muted">\n A darkness swirls at the center of a wo ...
## [19] <p class="text-muted">\n As Scott Lang balances being both a Sup ...
## [20] <p class="text-muted">\n French dancers gather in a remote, empt ...
## ...
# Pull the plain text out of every description node
description_data <- description_data_html %>% html_text()
# Preview the first few descriptions
description_data %>% head()
## [1] "\n The story of the legendary rock band Queen and lead singer Freddie Mercury, leading up to their famous performance at Live Aid (1985)."
## [2] "\n A small group of American soldiers find horror behind enemy lines on the eve of D-Day."
## [3] "\n A working-class Italian-American bouncer becomes the driver of an African-American classical pianist on a tour of venues through the 1960s American South."
## [4] "\n Arthur Curry, the human-born heir to the underwater kingdom of Atlantis, goes on a quest to prevent a war between the worlds of ocean and land."
## [5] "\n In early 18th century England, a frail Queen Anne occupies the throne and her close friend, Lady Sarah, governs the country in her stead. When a new servant, Abigail, arrives, her charm endears her to Sarah."
## [6] "\n A musician helps a young singer find fame, even as age and alcoholism send his own career into a downward spiral."
# Remove the leading newline and the indentation that follows it
description_data <- description_data %>% str_replace("^\\n\\s+", "")
description_data %>% head()
## [1] "The story of the legendary rock band Queen and lead singer Freddie Mercury, leading up to their famous performance at Live Aid (1985)."
## [2] "A small group of American soldiers find horror behind enemy lines on the eve of D-Day."
## [3] "A working-class Italian-American bouncer becomes the driver of an African-American classical pianist on a tour of venues through the 1960s American South."
## [4] "Arthur Curry, the human-born heir to the underwater kingdom of Atlantis, goes on a quest to prevent a war between the worlds of ocean and land."
## [5] "In early 18th century England, a frail Queen Anne occupies the throne and her close friend, Lady Sarah, governs the country in her stead. When a new servant, Abigail, arrives, her charm endears her to Sarah."
## [6] "A musician helps a young singer find fame, even as age and alcoholism send his own career into a downward spiral."
# Using CSS selectors to scrape the movie runtime section
# Select the runtime nodes and keep only their text
runtime_data_html <- webpage %>% html_nodes('.text-muted .runtime')
runtime_data <- runtime_data_html %>% html_text()
# Preview the raw runtime strings
runtime_data %>% head()
## [1] "134 min" "110 min" "130 min" "143 min" "119 min" "136 min"
# Data-Preprocessing: drop the " min" unit and convert to numbers in one step
runtime_data <- as.numeric(str_replace(runtime_data, " min", ""))
# Preview the cleaned runtimes
runtime_data %>% head()
## [1] 134 110 130 143 119 136
# Using CSS selectors to scrape the movie genre section
# Select the genre nodes and keep only their text
genre_data_html <- webpage %>% html_nodes('.genre')
genre_data <- genre_data_html %>% html_text()
# Preview the raw genre strings
genre_data %>% head()
## [1] "\nBiography, Drama, Music "
## [2] "\nAction, Adventure, Horror "
## [3] "\nBiography, Comedy, Drama "
## [4] "\nAction, Adventure, Fantasy "
## [5] "\nBiography, Comedy, Drama "
## [6] "\nDrama, Music, Romance "
# Data-Preprocessing: keep only the first genre of each film. Everything up
# to the first comma is taken and then trimmed of surrounding whitespace, so
# hyphenated genres such as "Sci-Fi" stay intact (a letters-only pattern
# would cut them down to "Sci").
genre_data <- str_trim(str_extract(genre_data, "[^,]+"))
# Converting each genre from text to factor
#genre_data <- as.factor(genre_data)
# Let's have another look at the genre data
head(genre_data)
## [1] "Biography" "Action" "Biography" "Action" "Biography" "Drama"
# Using CSS selectors to scrape the IMDB rating section
# Select the IMDB rating nodes and keep only their text
rating_data_html <- webpage %>% html_nodes('.ratings-imdb-rating strong')
rating_data <- rating_data_html %>% html_text()
# Preview the raw rating strings
rating_data %>% head()
## [1] "8.2" "6.9" "8.3" "7.4" "7.8" "7.9"
# Data-Preprocessing: turn the rating strings into numbers
rating_data <- rating_data %>% as.numeric()
# Print the full vector of numeric ratings
rating_data
## [1] 8.2 6.9 8.3 7.4 7.8 7.9 7.8 6.2 7.1 5.3 7.9 7.1 8.7 8.5 7.5 6.7 7.3
## [18] 7.0 7.1 7.4 7.2 5.6 7.4 7.4 6.6 6.1 6.8 7.1 7.2 7.3 6.5 5.7 7.0 8.3
## [35] 7.6 6.8 7.5 7.3 6.9 6.7 7.8 6.3 6.3 7.0 7.0 7.0 6.6 6.3 7.2 7.3 6.9
## [52] 7.0 7.7 7.8 7.6 7.3 5.5 7.6 6.7 7.7 7.3 6.2 7.0 7.6 5.4 6.3 5.4 7.6
## [69] 7.1 6.5 6.7 7.6 6.6 7.6 7.0 7.9 7.7 6.8 4.4 7.0 8.3 7.1 6.8 5.7 7.4
## [86] 6.1 5.4 6.1 7.3 7.1 6.2 5.7 8.7 7.5 7.7 9.8 6.7 6.1 6.5 6.6
# Using CSS selectors to scrape the votes section
# Select the vote-count nodes and keep only their text
votes_data_html <- webpage %>%
  html_nodes('.sort-num_votes-visible span:nth-child(2)')
votes_data <- votes_data_html %>% html_text()
# Preview the raw vote strings
votes_data %>% head()
## [1] "244,241" "29,878" "60,794" "163,687" "50,581" "176,405"
# Data-Preprocessing: remove every thousands separator. str_replace() only
# drops the first comma, so a count such as "1,234,567" would be left as
# "1234,567" and turn into NA in the numeric conversion below.
votes_data <- str_replace_all(votes_data, ",", "")
# Data-Preprocessing: converting votes to numerical
votes_data <- as.numeric(votes_data)
# Let's have another look at the votes data
votes_data
## [1] 244241 29878 60794 163687 50581 176405 175164 37925 41656 23638
## [11] 72984 30559 94428 583976 104499 180681 11418 23607 193639 11827
## [21] 18515 1292 94010 459800 23478 18243 231555 50103 65200 69737
## [31] 9680 8122 203565 11109 273314 117735 277320 1170 67621 4737
## [41] 346678 2976 23125 11324 6157 11201 71218 2192 32848 21356
## [51] 210740 77897 22080 213750 9628 130083 13550 10211 75064 13550
## [61] 45918 126606 19738 35130 82787 36267 9573 6103 12276 4012
## [71] 3505 89637 127544 17452 24709 95930 78972 18521 2242 149057
## [81] 4708 82980 14647 97179 79774 105649 78155 433 53870 709
## [91] 197317 4497 17772 33425 74808 8218 1231 40887 39310 37007
# Using CSS selectors to scrape the directors section
# Select the first link in the paragraph that follows the muted text
directors_data_html <- webpage %>% html_nodes('.text-muted+ p a:nth-child(1)')
directors_data_html
## {xml_nodeset (100)}
## [1] <a href="/name/nm0001741/?ref_=adv_li_dr_0">Bryan Singer</a>
## [2] <a href="/name/nm1170339/?ref_=adv_li_dr_0">Julius Avery</a>
## [3] <a href="/name/nm0268380/?ref_=adv_li_dr_0">Peter Farrelly</a>
## [4] <a href="/name/nm1490123/?ref_=adv_li_dr_0">James Wan</a>
## [5] <a href="/name/nm0487166/?ref_=adv_li_dr_0">Yorgos Lanthimos</a>
## [6] <a href="/name/nm0177896/?ref_=adv_li_dr_0">Bradley Cooper</a>
## [7] <a href="/name/nm0083348/?ref_=adv_li_dr_0">Brad Bird</a>
## [8] <a href="/name/nm0729514/?ref_=adv_li_dr_0">Christian Rivers</a>
## [9] <a href="/name/nm2588606/?ref_=adv_li_dr_0">Steve McQueen</a>
## [10] <a href="/name/nm1163264/?ref_=adv_li_dr_0">Otto Bathurst</a>
## [11] <a href="/name/nm0190859/?ref_=adv_li_dr_0">Alfonso Cuarón</a>
## [12] <a href="/name/nm0570912/?ref_=adv_li_dr_0">Adam McKay</a>
## [13] <a href="/name/nm2130108/?ref_=adv_li_dr_0">Bob Persichetti</a>
## [14] <a href="/name/nm0751577/?ref_=adv_li_dr_0">Anthony Russo</a>
## [15] <a href="/name/nm0000490/?ref_=adv_li_dr_0">Spike Lee</a>
## [16] <a href="/name/nm0081540/?ref_=adv_li_dr_0">Susanne Bier</a>
## [17] <a href="/name/nm1716636/?ref_=adv_li_dr_0">Marielle Heller</a>
## [18] <a href="/name/nm0345174/?ref_=adv_li_dr_0">Luca Guadagnino</a>
## [19] <a href="/name/nm0715636/?ref_=adv_li_dr_0">Peyton Reed</a>
## [20] <a href="/name/nm0637615/?ref_=adv_li_dr_0">Gaspar Noé</a>
## ...
# Extract the director names as text
directors_data <- directors_data_html %>% html_text()
# Preview the first few directors
directors_data %>% head()
## [1] "Bryan Singer" "Julius Avery" "Peter Farrelly"
## [4] "James Wan" "Yorgos Lanthimos" "Bradley Cooper"
# Data-Preprocessing: store the director names as a factor and print it
directors_data <- as.factor(directors_data)
directors_data
## [1] Bryan Singer Julius Avery Peter Farrelly
## [4] James Wan Yorgos Lanthimos Bradley Cooper
## [7] Brad Bird Christian Rivers Steve McQueen
## [10] Otto Bathurst Alfonso Cuarón Adam McKay
## [13] Bob Persichetti Anthony Russo Spike Lee
## [16] Susanne Bier Marielle Heller Luca Guadagnino
## [19] Peyton Reed Gaspar Noé Clint Eastwood
## [22] Robert D. Krzykowski Damien Chazelle Ryan Coogler
## [25] Donovan Marsh Fede Alvarez Ruben Fleischer
## [28] Travis Knight Drew Goddard Ethan Coen
## [31] Josie Rourke Charles Stone III Ron Howard
## [34] Tatsuya Nagamine John Krasinski David Yates
## [37] Steven Spielberg Joe Penna Paul Feig
## [40] Matthew Heineman David Leitch Jason Reitman
## [43] Yarrow Cheney Joel Edgerton Julian Schnabel
## [46] Asghar Farhadi Jeff Tomsic Christopher Caldwell
## [49] Rob Marshall Felix van Groeningen Alex Garland
## [52] Jon M. Chu Pawel Pawlikowski Christopher McQuarrie
## [55] Barry Jenkins Ari Aster Lasse Hallström
## [58] Sean Anders David Gordon Green Chang-dong Lee
## [61] Phil Johnston Gary Ross Jacques Audiard
## [64] Steven Caple Jr. Shane Black Genndy Tartakovsky
## [67] Jeffrey Nachmanoff Jon S. Baird George Tillman Jr.
## [70] Mimi Leder Karyn Kusama Leigh Whannell
## [73] Francis Lawrence Gustav Möller Lars von Trier
## [76] Wes Anderson Aneesh Chaganty David Lowery
## [79] Tyler Perry John Francis Daley Nadine Labaki
## [82] Stefano Sollima Anne Fletcher Jon Turteltaub
## [85] David Slade Brad Peyton Corin Hardy
## [88] Emma Forrest Susan Johnson Peter Hutchings
## [91] J.A. Bayona Donald Petrie Prashanth Neel
## [94] Bo Burnham Greg Berlanti Ram
## [97] Claire Denis Peter Berg Andy Serkis
## [100] Panos Cosmatos
## 100 Levels: Adam McKay Alex Garland Alfonso Cuarón ... Yorgos Lanthimos
# Using CSS selectors to scrape the actors section
# Select the link that directly follows the ".ghost" separator in each item
actors_data_html <- webpage %>% html_nodes('.lister-item-content .ghost+ a')
actors_data_html
## {xml_nodeset (100)}
## [1] <a href="/name/nm1785339/?ref_=adv_li_st_0">Rami Malek</a>
## [2] <a href="/name/nm5381254/?ref_=adv_li_st_0">Jovan Adepo</a>
## [3] <a href="/name/nm0001557/?ref_=adv_li_st_0">Viggo Mortensen</a>
## [4] <a href="/name/nm0597388/?ref_=adv_li_st_0">Jason Momoa</a>
## [5] <a href="/name/nm1469236/?ref_=adv_li_st_0">Olivia Colman</a>
## [6] <a href="/name/nm3078932/?ref_=adv_li_st_0">Lady Gaga</a>
## [7] <a href="/name/nm0005266/?ref_=adv_li_st_0">Craig T. Nelson</a>
## [8] <a href="/name/nm2623492/?ref_=adv_li_st_0">Hera Hilmar</a>
## [9] <a href="/name/nm0205626/?ref_=adv_li_st_0">Viola Davis</a>
## [10] <a href="/name/nm5473782/?ref_=adv_li_st_0">Taron Egerton</a>
## [11] <a href="/name/nm8611957/?ref_=adv_li_st_0">Yalitza Aparicio</a>
## [12] <a href="/name/nm0000288/?ref_=adv_li_st_0">Christian Bale</a>
## [13] <a href="/name/nm4271336/?ref_=adv_li_st_0">Shameik Moore</a>
## [14] <a href="/name/nm0000375/?ref_=adv_li_st_0">Robert Downey Jr.</a>
## [15] <a href="/name/nm0913475/?ref_=adv_li_st_0">John David Washington</a>
## [16] <a href="/name/nm0000113/?ref_=adv_li_st_0">Sandra Bullock</a>
## [17] <a href="/name/nm0565250/?ref_=adv_li_st_0">Melissa McCarthy</a>
## [18] <a href="/name/nm1631269/?ref_=adv_li_st_0">Chloë Grace Moretz</a>
## [19] <a href="/name/nm0748620/?ref_=adv_li_st_0">Paul Rudd</a>
## [20] <a href="/name/nm1154749/?ref_=adv_li_st_0">Sofia Boutella</a>
## ...
# Extract the actor names as text
actors_data <- actors_data_html %>% html_text()
# Preview the first few actors
actors_data %>% head()
## [1] "Rami Malek" "Jovan Adepo" "Viggo Mortensen" "Jason Momoa"
## [5] "Olivia Colman" "Lady Gaga"
# Data-Preprocessing: store the actor names as a factor and print it
actors_data <- as.factor(actors_data)
actors_data
## [1] Rami Malek Jovan Adepo Viggo Mortensen
## [4] Jason Momoa Olivia Colman Lady Gaga
## [7] Craig T. Nelson Hera Hilmar Viola Davis
## [10] Taron Egerton Yalitza Aparicio Christian Bale
## [13] Shameik Moore Robert Downey Jr. John David Washington
## [16] Sandra Bullock Melissa McCarthy Chloë Grace Moretz
## [19] Paul Rudd Sofia Boutella Bradley Cooper
## [22] Sam Elliott Ryan Gosling Chadwick Boseman
## [25] Gerard Butler Claire Foy Tom Hardy
## [28] Hailee Steinfeld Jeff Bridges Tim Blake Nelson
## [31] Saoirse Ronan Kyrie Irving Alden Ehrenreich
## [34] Masako Nozawa Emily Blunt Eddie Redmayne
## [37] Tye Sheridan Mads Mikkelsen Anna Kendrick
## [40] Rosamund Pike Ryan Reynolds Hugh Jackman
## [43] Benedict Cumberbatch Lucas Hedges Willem Dafoe
## [46] Penélope Cruz Jeremy Renner Sophie Thatcher
## [49] Emily Blunt Steve Carell Natalie Portman
## [52] Constance Wu Joanna Kulig Tom Cruise
## [55] KiKi Layne Toni Collette Mackenzie Foy
## [58] Mark Wahlberg Jamie Lee Curtis Ah-in Yoo
## [61] John C. Reilly Sandra Bullock John C. Reilly
## [64] Michael B. Jordan Boyd Holbrook Adam Sandler
## [67] Alice Eve Shirley Henderson Amandla Stenberg
## [70] Felicity Jones Nicole Kidman Logan Marshall-Green
## [73] Jennifer Lawrence Jakob Cedergren Matt Dillon
## [76] Bryan Cranston John Cho Robert Redford
## [79] Tiffany Haddish Jason Bateman Zain Al Rafeea
## [82] Benicio Del Toro Danielle Macdonald Jason Statham
## [85] Fionn Whitehead Dwayne Johnson Demián Bichir
## [88] Alice Eve Lana Condor Asa Butterfield
## [91] Chris Pratt Hayden Christensen Yash
## [94] Elsie Fisher Nick Robinson Mammootty
## [97] Robert Pattinson Mark Wahlberg Christian Bale
## [100] Nicolas Cage
## 94 Levels: Adam Sandler Ah-in Yoo Alden Ehrenreich ... Zain Al Rafeea
Be careful with missing data.
# Select the metascore nodes with their CSS selector
metascore_data_html <- webpage %>% html_nodes('.metascore')
# Extract the metascore text from each node
metascore_data <- metascore_data_html %>% html_text()
# Preview the raw metascore strings
metascore_data %>% head()
## [1] "49 " "60 " "69 " "55 " "90 "
## [6] "88 "
# Data-Preprocessing: drop the trailing whitespace, then convert to numbers
metascore_data <- metascore_data %>%
  str_replace("\\s*$", "") %>%
  as.numeric()
metascore_data
## [1] 49 60 69 55 90 88 80 44 84 32 96 61 87 68 83 51 87 64 70 83 58 50 84
## [24] 88 43 43 35 66 60 79 60 57 62 59 82 52 64 71 67 75 66 61 51 70 78 66
## [47] 56 68 66 62 79 74 90 86 87 87 39 57 67 90 71 61 78 66 48 54 19 75 81
## [70] 60 62 67 53 83 42 82 71 80 39 66 75 61 53 46 45 46 45 64 43 51 28 90
## [93] 72 81 38 51 81
# Count how many metascores were scraped; fewer than 100 means some are missing
metascore_data %>% length()
## [1] 97
# Visual inspection finds that the films ranked 85, 93 and 96 don't have a
# metascore, so those positions are left as NA.
missing_metascore <- c(85, 93, 96)
# Stop early if the number of scraped scores no longer matches the hand-picked
# gaps; otherwise the fill below would silently misalign scores and films.
stopifnot(length(metascore_data) == 100 - length(missing_metascore))
ms <- rep(NA_real_, 100)
ms[-missing_metascore] <- metascore_data
(metascore_data <- ms)
## [1] 49 60 69 55 90 88 80 44 84 32 96 61 87 68 83 51 87 64 70 83 58 50 84
## [24] 88 43 43 35 66 60 79 60 57 62 59 82 52 64 71 67 75 66 61 51 70 78 66
## [47] 56 68 66 62 79 74 90 86 87 87 39 57 67 90 71 61 78 66 48 54 19 75 81
## [70] 60 62 67 53 83 42 82 71 80 39 66 75 61 53 46 NA 45 46 45 64 43 51 28
## [93] NA 90 72 NA 81 38 51 81
Be careful with missing data.
# Select the gross revenue nodes with their CSS selector
gross_data_html <- webpage %>% html_nodes('.ghost~ .text-muted+ span')
# Extract the gross revenue text from each node
gross_data <- gross_data_html %>% html_text()
# Preview the raw gross revenue strings
gross_data %>% head()
## [1] "$210.67M" "$21.70M" "$61.68M" "$328.69M" "$30.29M" "$208.79M"
# Data-Preprocessing: drop the "M" suffix, skip the leading "$" by taking
# characters 2 to 10, then convert what is left to numbers
gross_data <- gross_data %>%
  str_replace("M", "") %>%
  str_sub(2, 10) %>%
  as.numeric()
#(gross_data <- str_extract(gross_data, "[:digit:]+.[:digit:]+"))
# Count how many gross figures were scraped
gross_data %>% length()
## [1] 85
# Visual inspection finds that the films at the positions below don't have a
# gross figure, so those positions are left as NA.
missing_gross <- c(6, 12, 29, 40, 61, 69, 71, 74, 78, 82, 84:87, 90)
# Stop early if the number of scraped values no longer matches the hand-picked
# gaps; otherwise the fill below would silently misalign values and films.
stopifnot(length(gross_data) == 100 - length(missing_gross))
gs_data <- rep(NA_real_, 100)
gs_data[-missing_gross] <- gross_data
(gross_data <- gs_data)
## [1] 210.67 21.70 61.68 328.69 30.29 NA 208.79 608.58 15.95 42.39
## [11] 30.82 NA 45.28 180.04 678.82 48.69 8.55 2.47 216.65 102.64
## [21] 44.94 700.06 15.77 14.84 213.52 125.84 17.84 16.47 NA 42.47
## [31] 213.77 30.38 188.02 159.44 137.69 0.16 53.54 1.63 324.59 NA
## [41] 2.00 270.60 6.79 2.29 0.08 54.55 169.83 7.65 32.73 174.53
## [51] 2.90 220.16 13.79 44.07 54.86 67.36 159.34 0.70 197.59 139.38
## [61] NA 3.14 115.62 51.02 167.51 4.04 4.30 29.72 NA 23.85
## [71] NA 1.47 11.87 NA 46.87 0.21 0.09 NA 32.02 26.02
## [81] 11.28 NA 31.71 NA NA NA NA 69.00 0.74 NA
## [91] 50.07 145.44 99.35 117.44 417.72 0.99 13.54 40.83 36.11 1.21
The following code programmatically figures out the missing entries for metascore.
# Select the ranking nodes and the metascore nodes together in one node set
rank_metascore_data_html <- webpage %>%
  html_nodes('.unfavorable , .favorable , .mixed , .text-primary')
rank_metascore_data_html
## {xml_nodeset (197)}
## [1] <span class="lister-item-index unbold text-primary">1.</span>
## [2] <span class="metascore mixed">49 </span>
## [3] <span class="lister-item-index unbold text-primary">2.</span>
## [4] <span class="metascore mixed">60 </span>
## [5] <span class="lister-item-index unbold text-primary">3.</span>
## [6] <span class="metascore favorable">69 </span>
## [7] <span class="lister-item-index unbold text-primary">4.</span>
## [8] <span class="metascore mixed">55 </span>
## [9] <span class="lister-item-index unbold text-primary">5.</span>
## [10] <span class="metascore favorable">90 </span>
## [11] <span class="lister-item-index unbold text-primary">6.</span>
## [12] <span class="metascore favorable">88 </span>
## [13] <span class="lister-item-index unbold text-primary">7.</span>
## [14] <span class="metascore favorable">80 </span>
## [15] <span class="lister-item-index unbold text-primary">8.</span>
## [16] <span class="metascore mixed">44 </span>
## [17] <span class="lister-item-index unbold text-primary">9.</span>
## [18] <span class="metascore favorable">84 </span>
## [19] <span class="lister-item-index unbold text-primary">10.</span>
## [20] <span class="metascore unfavorable">32 </span>
## ...
# Extract the text of the combined rank and metascore nodes
rank_metascore_data <- rank_metascore_data_html %>% html_text()
rank_metascore_data
## [1] "1." "49 " "2." "60 " "3."
## [6] "69 " "4." "55 " "5." "90 "
## [11] "6." "88 " "7." "80 " "8."
## [16] "44 " "9." "84 " "10." "32 "
## [21] "11." "96 " "12." "61 " "13."
## [26] "87 " "14." "68 " "15." "83 "
## [31] "16." "51 " "17." "87 " "18."
## [36] "64 " "19." "70 " "20." "83 "
## [41] "21." "58 " "22." "50 " "23."
## [46] "84 " "24." "88 " "25." "43 "
## [51] "26." "43 " "27." "35 " "28."
## [56] "66 " "29." "60 " "30." "79 "
## [61] "31." "60 " "32." "57 " "33."
## [66] "62 " "34." "59 " "35." "82 "
## [71] "36." "52 " "37." "64 " "38."
## [76] "71 " "39." "67 " "40." "75 "
## [81] "41." "66 " "42." "61 " "43."
## [86] "51 " "44." "70 " "45." "78 "
## [91] "46." "66 " "47." "56 " "48."
## [96] "68 " "49." "66 " "50." "62 "
## [101] "51." "79 " "52." "74 " "53."
## [106] "90 " "54." "86 " "55." "87 "
## [111] "56." "87 " "57." "39 " "58."
## [116] "57 " "59." "67 " "60." "90 "
## [121] "61." "71 " "62." "61 " "63."
## [126] "78 " "64." "66 " "65." "48 "
## [131] "66." "54 " "67." "19 " "68."
## [136] "75 " "69." "81 " "70." "60 "
## [141] "71." "62 " "72." "67 " "73."
## [146] "53 " "74." "83 " "75." "42 "
## [151] "76." "82 " "77." "71 " "78."
## [156] "80 " "79." "39 " "80." "66 "
## [161] "81." "75 " "82." "61 " "83."
## [166] "53 " "84." "46 " "85." "86."
## [171] "45 " "87." "46 " "88." "45 "
## [176] "89." "64 " "90." "43 " "91."
## [181] "51 " "92." "28 " "93." "94."
## [186] "90 " "95." "72 " "96." "97."
## [191] "81 " "98." "38 " "99." "51 "
## [196] "100." "81 "
# Remove the first run of whitespace from every entry
rank_metascore_data <- rank_metascore_data %>% str_replace("\\s+", "")
rank_metascore_data
## [1] "1." "49" "2." "60" "3." "69" "4." "55" "5." "90"
## [11] "6." "88" "7." "80" "8." "44" "9." "84" "10." "32"
## [21] "11." "96" "12." "61" "13." "87" "14." "68" "15." "83"
## [31] "16." "51" "17." "87" "18." "64" "19." "70" "20." "83"
## [41] "21." "58" "22." "50" "23." "84" "24." "88" "25." "43"
## [51] "26." "43" "27." "35" "28." "66" "29." "60" "30." "79"
## [61] "31." "60" "32." "57" "33." "62" "34." "59" "35." "82"
## [71] "36." "52" "37." "64" "38." "71" "39." "67" "40." "75"
## [81] "41." "66" "42." "61" "43." "51" "44." "70" "45." "78"
## [91] "46." "66" "47." "56" "48." "68" "49." "66" "50." "62"
## [101] "51." "79" "52." "74" "53." "90" "54." "86" "55." "87"
## [111] "56." "87" "57." "39" "58." "57" "59." "67" "60." "90"
## [121] "61." "71" "62." "61" "63." "78" "64." "66" "65." "48"
## [131] "66." "54" "67." "19" "68." "75" "69." "81" "70." "60"
## [141] "71." "62" "72." "67" "73." "53" "74." "83" "75." "42"
## [151] "76." "82" "77." "71" "78." "80" "79." "39" "80." "66"
## [161] "81." "75" "82." "61" "83." "53" "84." "46" "85." "86."
## [171] "45" "87." "46" "88." "45" "89." "64" "90." "43" "91."
## [181] "51" "92." "28" "93." "94." "90" "95." "72" "96." "97."
## [191] "81" "98." "38" "99." "51" "100." "81"
# Flag the entries that end in a period: those are ranks, the rest metascores.
# A rank followed by another rank means the metascore for the 1st rank is
# missing.
isrank <- rank_metascore_data %>% str_detect("\\.$")
isrank
## [1] TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE
## [12] FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE
## [23] TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE
## [34] FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE
## [45] TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE
## [56] FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE
## [67] TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE
## [78] FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE
## [89] TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE
## [100] FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE
## [111] TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE
## [122] FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE
## [133] TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE
## [144] FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE
## [155] TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE
## [166] FALSE TRUE FALSE TRUE TRUE FALSE TRUE FALSE TRUE FALSE TRUE
## [177] FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE TRUE FALSE TRUE
## [188] FALSE TRUE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE
# A rank is missing its metascore when the element right after it is another
# rank, or when it is the very last element. Dropping the first element of
# isrank and padding with TRUE lines each element up with its successor,
# covers a missing score on the final film, and keeps the mask the same length
# as rank_metascore_data so it is never recycled when used as an index.
# (The earlier form `1:length(x)-1` parses as `(1:n) - 1` and only worked
# because R ignores a 0 index.)
(ismissing <- isrank & c(isrank[-1], TRUE))
## [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [12] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [23] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [34] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [45] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [56] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [67] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [78] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [89] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [100] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [111] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [122] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [133] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [144] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [155] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [166] FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [177] FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE
## [188] FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
# Ranks whose metascore is missing. Index by position via which(): a logical
# index that is shorter than the vector is silently recycled by `[`, which
# could pick up unrelated trailing entries; which() also drops NA mask values.
# as.integer() turns rank strings such as "85." into 85.
(missingpos <- as.integer(rank_metascore_data[which(ismissing)]))
## [1] 85 93 96
#(rank_metascore_data <- as.integer(rank_metascore_data))
You (students) should work out the corresponding code for finding the positions where gross earnings are missing.
Form a tibble:
# Assemble the scraped fields column-wise into one tibble, one row per movie
movies <- tibble(
  Rank = rank_data,
  Title = title_data,
  Description = description_data,
  Runtime = runtime_data,
  Genre = genre_data,
  Rating = rating_data,
  Metascore = metascore_data,
  Votes = votes_data,
  Gross_Earning_in_Mil = gross_data,
  Director = directors_data,
  Actor = actors_data
)
# width = Inf prints every column instead of truncating to the console width
print(movies, width = Inf)
## # A tibble: 100 x 11
## Rank Title
## <int> <chr>
## 1 1 Bohemian Rhapsody
## 2 2 Overlord
## 3 3 Green Book
## 4 4 Aquaman
## 5 5 The Favourite
## 6 6 A Star Is Born
## 7 7 Incredibles 2
## 8 8 Mortal Engines
## 9 9 Widows
## 10 10 Robin Hood
## Description
## <chr>
## 1 The story of the legendary rock band Queen and lead singer Freddie Merc…
## 2 A small group of American soldiers find horror behind enemy lines on th…
## 3 A working-class Italian-American bouncer becomes the driver of an Afric…
## 4 Arthur Curry, the human-born heir to the underwater kingdom of Atlantis…
## 5 In early 18th century England, a frail Queen Anne occupies the throne a…
## 6 A musician helps a young singer find fame, even as age and alcoholism s…
## 7 The Incredibles hero family takes on a new mission, which involves a ch…
## 8 In a post-apocalyptic world where cities ride on wheels and consume eac…
## 9 Set in contemporary Chicago, amid a time of turmoil, four women with no…
## 10 A war-hardened Crusader and his Moorish commander mount an audacious re…
## Runtime Genre Rating Metascore Votes Gross_Earning_in_Mil
## <dbl> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 134 Biography 8.2 49 244241 211.
## 2 110 Action 6.9 60 29878 21.7
## 3 130 Biography 8.3 69 60794 61.7
## 4 143 Action 7.4 55 163687 329.
## 5 119 Biography 7.8 90 50581 30.3
## 6 136 Drama 7.9 88 176405 NA
## 7 118 Animation 7.8 80 175164 209.
## 8 128 Action 6.2 44 37925 609.
## 9 129 Crime 7.1 84 41656 16.0
## 10 116 Action 5.3 32 23638 42.4
## Director Actor
## <fct> <fct>
## 1 Bryan Singer Rami Malek
## 2 Julius Avery Jovan Adepo
## 3 Peter Farrelly Viggo Mortensen
## 4 James Wan Jason Momoa
## 5 Yorgos Lanthimos Olivia Colman
## 6 Bradley Cooper Lady Gaga
## 7 Brad Bird Craig T. Nelson
## 8 Christian Rivers Hera Hilmar
## 9 Steve McQueen Viola Davis
## 10 Otto Bathurst Taron Egerton
## # … with 90 more rows
How many top 100 movies are in each genre?
# Bar chart: number of movies per genre
ggplot(data = movies, mapping = aes(x = Genre)) +
  geom_bar()
Which genre is most profitable in terms of average gross earnings?
# Mean gross earnings per genre; movies without a reported gross (NA) are
# left out of the mean.
(earn_by_genre <- summarise(
  group_by(movies, Genre),
  avg_earning = mean(Gross_Earning_in_Mil, na.rm = TRUE)
))
## # A tibble: 10 x 2
## Genre avg_earning
## <chr> <dbl>
## 1 Action 103.
## 2 Adventure 147.
## 3 Animation 70.8
## 4 Biography 132.
## 5 Comedy 130.
## 6 Crime 43.4
## 7 Drama 45.5
## 8 Fantasy 2.47
## 9 Horror 198.
## 10 Sci 4.3
# Column chart of the per-genre average computed in earn_by_genre
ggplot(earn_by_genre, aes(x = Genre, y = avg_earning)) +
  geom_col() +
  ylab("avg earning in millions")
# Distribution of gross earnings within each genre
ggplot(movies, aes(x = Genre, y = Gross_Earning_in_Mil)) +
  geom_boxplot() +
  ylab("Gross earning in millions")
## Warning: Removed 15 rows containing non-finite values (stat_boxplot).
Is there a relationship between gross earning and rating? Find the best-selling movie (by gross earning) in each genre.
library("ggrepel")
# Within each genre keep the single movie that ranks first by descending
# gross earnings; the result stays grouped by Genre.
(best_in_genre <- filter(
  group_by(movies, Genre),
  row_number(desc(Gross_Earning_in_Mil)) == 1
))
## # A tibble: 10 x 11
## # Groups: Genre [10]
## Rank Title Description Runtime Genre Rating Metascore Votes
## <int> <chr> <chr> <dbl> <chr> <dbl> <dbl> <dbl>
## 1 7 Incr… The Incred… 118 Anim… 7.8 80 175164
## 2 8 Mort… In a post-… 128 Acti… 6.2 44 37925
## 3 15 Blac… Ron Stallw… 135 Biog… 7.5 83 104499
## 4 18 Susp… A darkness… 152 Fant… 7 64 23607
## 5 22 The … A legendar… 98 Adve… 5.6 50 1292
## 6 55 If B… A woman in… 119 Crime 7.6 87 9628
## 7 59 Hall… Laurie Str… 106 Horr… 6.7 67 75064
## 8 60 Burn… Jong-su bu… 148 Drama 7.7 90 13550
## 9 67 Repl… A scientis… 107 Sci 5.4 19 9573
## 10 95 Love… Simon Spie… 110 Come… 7.7 72 74808
## # … with 3 more variables: Gross_Earning_in_Mil <dbl>, Director <fct>,
## # Actor <fct>
# Rating vs. gross earnings: point size encodes votes, colour encodes genre,
# and the top-grossing movie of each genre is labelled with its title.
ggplot(
  data = movies,
  mapping = aes(x = Rating, y = Gross_Earning_in_Mil)
) +
  geom_point(aes(size = Votes, color = Genre)) +
  ggrepel::geom_label_repel(
    data = best_in_genre,
    mapping = aes(label = Title)
  ) +
  ylab("Gross earning in millions")
## Warning: Removed 15 rows containing missing values (geom_point).
Read the blog post https://www.r-bloggers.com/how-to-scrape-images-from-google/
For example, to download the first 20 images for the search term “ucla”:
source("scrapeGoogleImages.R")
## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
##
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following object is masked from 'package:purrr':
##
## compact
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
### exchange the search terms here!
# scrapeJSSite() comes from scrapeGoogleImages.R; it returns a data frame with
# the image URLs in `images` and the search term repeated in `search`.
(gg <- scrapeJSSite(searchTerm = "ucla"))
## images
## 1 https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcSPAHGngfC90w8TVGwxDtF7EOfHRruNfeHqVicOCZHfcFZ28aN9AGMPOrE7
## 2 https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQjKYK2GAUPobVyiIe5pKGd9EGrqLaWIdw3a7XskhrW2ezEcFrcWn94-5Pq
## 3 https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQzx04u2Ub5IC4UqGukXpcq2Gsozo52nJeQOaI7b8JzgHmTdCkGPgKl3OGX
## 4 https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRi4lynSHbXZ4Iw8g2dqSWIHUbwYlVAnCG8JmoJk0m5TDqv7u1A4DZXIXo
## 5 https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcS1OecHJM-b8-NAvWfsoi8xuOtoHfibWWGCkQ21ZbQM_BELXdODyq5ZAQ
## 6 https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQ84ERmbLNfXOn48-fXI3kVLCqjBv-mRS1c5y2yinRX1MSy2PuD6ZVIcj7r
## 7 https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQtayay8IIEhDm5NyjKj_1Jir0j1YKdHHcUoUJaI0MYV5t7WcK7iK0e_bfN
## 8 https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcS7PJWOnMu2-F2ZOcKYLiHF6EJM4ddYQas6W7_nEOqv4kmyo5ANyJK-wy1g
## 9 https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcSi7wafHAE8gTQM7LWdMDi49sfwV9bp4n2-l6MdJ3pODMLa-z83zQZ4j2FY
## 10 https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRMEqWVOjAacm0LJTgyltC9U1l9RaZLguWRRBGFfepHhGaRoMaEs2v5iQ
## 11 https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQV1joSR-AfZLeCI-glojukNQZjylMEGb7C5vHTc1ZOq-PbJGYreRQXeVY
## 12 https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQgDqKCP2Uj7dmIqDKLCzwnO2Nxe4NKkxqi7yzULvq5yIvx9AM5Let8VSfF
## 13 https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTI-oSi_FSFDeqOxkX1sse5D2Q-yfDKE28MMq3lOh1B9LckMTTYi-JxJIC8
## 14 https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcT6SWyqwFbDz5WNWBSbZXABGGTNxP122uQeg1skNxBpv5IBeMCGX4NiSK4
## 15 https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRcuXKyr-IO6FRGq4mKUqfaZMmVUKdqOFGkC74VFxLv3atIlywjhMvLay5Z
## 16 https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQE3k_c0rFd-J7mOLS19TZ4crq6Bx9vXI3gNGPklgoVArW3skuai1ljggub
## 17 https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQsP-A6JtaJjfe33HsF5Ng7yRa4HRalvfCBNjERoFxWhvFL_ALJ3UcYtCno
## 18 https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcT6Vr40BZfh0xMbWCgMRLlHZgPIuZtrEsGMM4FyGu3kEcGLrauySTKCjZ2_
## 19 https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcSW9Zy18ebhGpc8MZWU6js3GZFQGb9F8cn5IRi1PmKLmJtjhSJ_78zZgmw
## 20 https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRgi-i08PtLlY3n4XJHTZqUfm6OPIqMyDiFKKJ-gYTlyNMs2V49gvm5M7KY
## search
## 1 ucla
## 2 ucla
## 3 ucla
## 4 ucla
## 5 ucla
## 6 ucla
## 7 ucla
## 8 ucla
## 9 ucla
## 10 ucla
## 11 ucla
## 12 ucla
## 13 ucla
## 14 ucla
## 15 ucla
## 16 ucla
## 17 ucla
## 18 ucla
## 19 ucla
## 20 ucla
downloadImages(as.character(gg$images), 1)
Let’s peek into the R code:
cat scrapeGoogleImages.R
## library(plyr)
## library(reshape2)
## require(rvest)
##
##
## scrapeJSSite <- function(searchTerm){
## url <- paste0("https://www.google.de/search?q=", searchTerm,
## "&source=lnms&tbm=isch&sa=X")
##
## lines <- readLines("scrapeGoogleImages.js")
## lines[1] <- paste0("var url ='", url, "';")
## writeLines(lines, "scrapeGoogleImages.js")
##
## ## Download website
## system("./phantomjs scrapeGoogleImages.js")
##
## pg <- read_html("1.html")
## files <- pg %>% html_nodes("img") %>% html_attr("src")
## df <- data.frame(images=files, search=searchTerm)
## return(df)
## }
##
##
## downloadImages <- function(files, brand, outPath="images"){
## for(i in 1:length(files)){
## download.file(files[i], destfile = paste0(outPath, "/", brand, "_", i, ".jpg"), mode = 'wb')
## }
##
## }
It substitutes the query term into the JavaScript file, which is then run with phantomjs (a bare-bones, non-GUI browser) to scrape the webpage into 1.html. The images referenced in 1.html are then downloaded to the images folder.
ls images/
## 1_1.jpg
## 1_10.jpg
## 1_11.jpg
## 1_12.jpg
## 1_13.jpg
## 1_14.jpg
## 1_15.jpg
## 1_16.jpg
## 1_17.jpg
## 1_18.jpg
## 1_19.jpg
## 1_2.jpg
## 1_20.jpg
## 1_3.jpg
## 1_4.jpg
## 1_5.jpg
## 1_6.jpg
## 1_7.jpg
## 1_8.jpg
## 1_9.jpg
The quantmod package contains many utility functions for retrieving and plotting finance data. E.g.,
library(quantmod)
## Loading required package: xts
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
## Attaching package: 'xts'
## The following objects are masked from 'package:dplyr':
##
## first, last
## Loading required package: TTR
## Version 0.4-0 included new data defaults. See ?getSymbols.
# Fetch AAPL price data from Yahoo; auto.assign = FALSE is passed so the
# result is returned and can be bound to `stock` here.
stock <- getSymbols("AAPL", src = "yahoo", auto.assign = FALSE)
## 'getSymbols' currently uses auto.assign=TRUE by default, but will
## use auto.assign=FALSE in 0.5-0. You will still be able to use
## 'loadSymbols' to automatically load data. getOption("getSymbols.env")
## and getOption("getSymbols.auto.assign") will still be checked for
## alternate defaults.
##
## This message is shown once per session and may be disabled by setting
## options("getSymbols.warning4.0"=FALSE). See ?getSymbols for details.
##
## WARNING: There have been significant changes to Yahoo Finance data.
## Please see the Warning section of '?getSymbols.yahoo' for details.
##
## This message is shown once per session and may be disabled by setting
## options("getSymbols.yahoo.warning"=FALSE).
# Preview the first rows of the downloaded series
head(stock)
## AAPL.Open AAPL.High AAPL.Low AAPL.Close AAPL.Volume
## 2007-01-03 12.32714 12.36857 11.70000 11.97143 309579900
## 2007-01-04 12.00714 12.27857 11.97429 12.23714 211815100
## 2007-01-05 12.25286 12.31428 12.05714 12.15000 208685400
## 2007-01-08 12.28000 12.36143 12.18286 12.21000 199276700
## 2007-01-09 12.35000 13.28286 12.16429 13.22429 837324600
## 2007-01-10 13.53571 13.97143 13.35000 13.85714 738220000
## AAPL.Adjusted
## 2007-01-03 7.982585
## 2007-01-04 8.159763
## 2007-01-05 8.101658
## 2007-01-08 8.141665
## 2007-01-09 8.817995
## 2007-01-10 9.239983
# Line chart of the series on the white theme, with a linear (non-log) scale
# and TA set to NULL.
chartSeries(
  stock,
  type = "line",
  theme = chartTheme("white"),
  TA = NULL,
  log.scale = FALSE
)